{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "import datetime\n",
    "import pyspark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# getOrCreate() returns the already-running SparkContext if there is one, so\n",
    "# re-executing this cell does not fail with\n",
    "# \"ValueError: Cannot run multiple SparkContexts at once\".\n",
    "sc = pyspark.SparkContext.getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fixed seed so the sampled letters are the same on every run of this cell.\n",
    "random.seed(42)\n",
    "\n",
    "# Exactly the 26 letters 'A'..'Z'. random.randint(65, 91) would be wrong here:\n",
    "# its upper bound is inclusive, so it also yields chr(91) == '[' as a 27th key.\n",
    "LETTERS = [chr(code) for code in range(ord(\"A\"), ord(\"Z\") + 1)]\n",
    "\n",
    "# One million letters drawn uniformly with replacement; random.choices returns a list.\n",
    "x = random.choices(LETTERS, k=1000000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hand the local Python list x to the SparkContext to obtain the base RDD\n",
    "# that every timing experiment below starts from.\n",
    "rdd = sc.parallelize(x)"
   ]
  },
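  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 两种不同的编写方式，执行时间是一样的\n",
    "\n",
    "原因：下面的 `rdd2` 没有调用 `cache()`/`persist()`，所以每次执行 action（这里是 `collect()`）时，Spark 都会从 `rdd` 重新计算整条依赖链。"
   ]
  },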
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 第一种写法：rdd2 只定义一次，被使用了两次\n",
    "\n",
    "两次都是用同一个 rdd2 做 reduceByKey。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0:00:10.367666\n",
      "-------------\n",
      "0:00:09.702549\n",
      "-------------\n",
      "0:00:20.070215\n",
      "#############\n",
      "[('N', 36921), ('J', 36963), ('O', 37176), ('[', 37147), ('H', 37025), ('B', 37037), ('R', 36973), ('W', 37438), ('C', 36989), ('Z', 37157), ('V', 36778), ('X', 36705), ('I', 37165), ('K', 36993), ('L', 37151), ('Q', 37044), ('T', 36665), ('U', 37025), ('A', 37126), ('F', 36657), ('M', 37224), ('S', 37248), ('Y', 36968), ('G', 36989), ('D', 37100), ('E', 37147), ('P', 37189)]\n",
      "[('N', 36921), ('J', 36963), ('O', 37176), ('[', 37147), ('H', 37025), ('B', 37037), ('R', 36973), ('W', 37438), ('C', 36989), ('Z', 37157), ('V', 36778), ('X', 36705), ('I', 37165), ('K', 36993), ('L', 37151), ('Q', 37044), ('T', 36665), ('U', 37025), ('A', 37126), ('F', 36657), ('M', 37224), ('S', 37248), ('Y', 36968), ('G', 36989), ('D', 37100), ('E', 37147), ('P', 37189)]\n"
     ]
    }
   ],
   "source": [
    "# Build the (letter, 1) pair RDD a single time; it is not cached or persisted here.\n",
    "rdd2 = rdd.map(lambda letter: (letter, 1))\n",
    "\n",
    "# First timed run: sum the 1s per key on rdd2 and collect the totals.\n",
    "started = datetime.datetime.now()\n",
    "counts = rdd2.reduceByKey(lambda left, right: left + right)\n",
    "res = counts.collect()\n",
    "rt1 = datetime.datetime.now() - started\n",
    "print(rt1, \"-------------\", sep=\"\\n\")\n",
    "\n",
    "# Second timed run: the same aggregation issued again on the very same rdd2.\n",
    "started = datetime.datetime.now()\n",
    "counts = rdd2.reduceByKey(lambda left, right: left + right)\n",
    "res2 = counts.collect()\n",
    "rt2 = datetime.datetime.now() - started\n",
    "print(rt2, \"-------------\", sep=\"\\n\")\n",
    "\n",
    "# Combined elapsed time of both runs, followed by the two result lists.\n",
    "print(rt1 + rt2)\n",
    "print(\"#############\")\n",
    "print(res)\n",
    "print(res2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 第二种写法：每次都重新写一遍 map + reduceByKey，生成两个不同的 rdd（rdd2 和 rdd3），但是结果相同，执行时间也大致一样"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0:00:10.230988\n",
      "-------------\n",
      "0:00:10.221805\n",
      "-------------\n",
      "0:00:20.452793\n",
      "#############\n",
      "[('N', 36921), ('J', 36963), ('O', 37176), ('[', 37147), ('H', 37025), ('B', 37037), ('R', 36973), ('W', 37438), ('C', 36989), ('Z', 37157), ('V', 36778), ('X', 36705), ('I', 37165), ('K', 36993), ('L', 37151), ('Q', 37044), ('T', 36665), ('U', 37025), ('A', 37126), ('F', 36657), ('M', 37224), ('S', 37248), ('Y', 36968), ('G', 36989), ('D', 37100), ('E', 37147), ('P', 37189)]\n",
      "[('N', 36921), ('J', 36963), ('O', 37176), ('[', 37147), ('H', 37025), ('B', 37037), ('R', 36973), ('W', 37438), ('C', 36989), ('Z', 37157), ('V', 36778), ('X', 36705), ('I', 37165), ('K', 36993), ('L', 37151), ('Q', 37044), ('T', 36665), ('U', 37025), ('A', 37126), ('F', 36657), ('M', 37224), ('S', 37248), ('Y', 36968), ('G', 36989), ('D', 37100), ('E', 37147), ('P', 37189)]\n"
     ]
    }
   ],
   "source": [
    "# First timed run: the whole map + reduceByKey pipeline is declared inside the timed region.\n",
    "started = datetime.datetime.now()\n",
    "pairs = rdd.map(lambda letter: (letter, 1))\n",
    "rdd2 = pairs.reduceByKey(lambda left, right: left + right)\n",
    "res = rdd2.collect()\n",
    "rt1 = datetime.datetime.now() - started\n",
    "print(rt1, \"-------------\", sep=\"\\n\")\n",
    "\n",
    "# Second timed run: an identical pipeline declared from scratch as a separate RDD (rdd3).\n",
    "started = datetime.datetime.now()\n",
    "pairs = rdd.map(lambda letter: (letter, 1))\n",
    "rdd3 = pairs.reduceByKey(lambda left, right: left + right)\n",
    "res2 = rdd3.collect()\n",
    "rt2 = datetime.datetime.now() - started\n",
    "print(rt2, \"-------------\", sep=\"\\n\")\n",
    "\n",
    "# Combined elapsed time of both runs, followed by the two result lists.\n",
    "print(rt1 + rt2)\n",
    "print(\"#############\")\n",
    "print(res)\n",
    "print(res2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
